Simulations with a single SNP & different sample sizes

1 Setup

First, simulate genotype data. Then, for each subject’s genotype, simulate replicate traits.

Code
library(magrittr)
Code
set.seed(2023-04-15)
n_tr <- 10000
n_te <- 100
n <- n_tr + n_te
# n is total number of subjects
n_reps <- 50
# n_reps is number of replicates
n_snp <- 1
# n_snp is number of SNPs
# simulate genotypes matrix
maf <- 0.5
geno <- matrix(sample(c(0,1,2), 
                        n*n_snp, 
                        replace=TRUE, 
                        prob=c(maf^2,2 * maf * (1-maf), (1-maf)^2)), 
                nrow=n, 
                ncol=n_snp)
# simulate phenotype
beta <- 1
pheno_list <- list()
error_vars <- c(0.5, 1, 2, 4.5)
for (error_var in error_vars){
    error_term <- matrix(data = rnorm(n * n_reps, sd = sqrt(error_var)), 
                        nrow = n, 
                        ncol = n_reps)
    y <- as.numeric(geno %*% beta) %*% t(rep(1, n_reps)) + error_term
    # prepare for splitting into training and testing
    # get test subject ids
    test_ids <- sample(1:n, n_te, replace = FALSE)
    # organize data
    dat <- tibble::as_tibble(y) %>%
        dplyr::rename_with(function(x){ num <- stringr::str_extract(x, "[0-9]+")
                                        return(paste0("pheno", num))}
                            ) %>%
        dplyr::bind_cols(geno %>% tibble::as_tibble() %>% dplyr::rename(geno = 1)) %>%
        dplyr::mutate(id = 1:n) %>% # fix this when using more than one SNP
        dplyr::mutate(in_test_set = id %in% test_ids)
    # split into training and testing
    training <- dat %>% dplyr::filter(!in_test_set)
    testing <- dat %>% dplyr::filter(in_test_set)
    testing2 <- testing %>% dplyr::mutate(fold = as.integer(NA))
    pheno_list[[as.character(error_var)]] <- list(training, testing2)
}
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
Code
# use all training with leave K out
alpha <- 0.1

Above, we specified the number of replicates for the simulations. We created 50 replicate traits for the same 1.01^{4} subjects. Note that each subject has only 1 SNP(s).

2 python code, adapted from Barber’s code for JK+ paper

Code
import numpy as np
import pandas as pd

def leastsq_minL2(X,Y,X1,tol=1e-8):
    uX,dX,vX = np.linalg.svd(X)
    rX = (dX>=dX[0]*tol).sum()
    betahat = (vX[:rX].T/dX[:rX]).dot(uX[:,:rX].T.dot(Y))
    return X1.dot(betahat)

def compute_PIs(X,Y,X1,alpha = 0.1, K=10, fit_muh_fun = leastsq_minL2):
    n = len(Y)
    #n1 = X1.shape[0]
    n1 = len(X1)
    ###############################
    # CV+
    ###############################
    n_K = np.floor(n/K).astype(int)
    base_inds_to_delete = np.arange(n_K).astype(int)
    resids_LKO = np.zeros(n)
    muh_LKO_vals_testpoint = np.zeros((n,n1))
    for i in range(K):
        inds_to_delete = (base_inds_to_delete + n_K*i).astype(int)
        X0 = np.delete(X, inds_to_delete)
        X0_2d = X0[:, np.newaxis]
        Y0 = np.delete(Y, inds_to_delete)
        X0d = X[inds_to_delete]
        X0d_2d = X0d[:, np.newaxis]
        X1_2d = X1[:, np.newaxis]
        muh_vals_LKO = fit_muh_fun(X0_2d, Y0, np.r_[X0d_2d, X1_2d])
        resids_LKO[inds_to_delete] = np.abs(Y[inds_to_delete] - muh_vals_LKO[:n_K, 0])
        for inner_K in range(n_K):
            muh_LKO_vals_testpoint[inds_to_delete[inner_K]] = muh_vals_LKO[n_K:, 0]
    ind_Kq = (np.ceil((1-alpha)*(n+1))).astype(int)
    PIs_dict = {'CV+' : pd.DataFrame(\
                    np.c_[np.sort(muh_LKO_vals_testpoint.T - resids_LKO,axis=1).T[-ind_Kq], \
                        np.sort(muh_LKO_vals_testpoint.T + resids_LKO,axis=1).T[ind_Kq-1]],\
                           columns = ['lower','upper'])}
    return pd.concat(PIs_dict.values(), axis=1, keys=PIs_dict.keys())
Code
np.random.seed(98765)
# simulation
n_vals = np.array([100, 1000, 10000, 100000]) # training set size
n1 = 100 # test set size
#SNR = 10
ntrial = 50
#dim_vals = np.arange(5,205,5)
d = 1
#beta = np.random.normal(size=d)
#beta = beta/np.sqrt((beta**2).sum()) * np.sqrt(SNR)
beta = np.array([1])
#X1 = np.random.normal(size=(n1,d))
vector = [0, 1, 2]
X1 = np.random.choice(vector, size=(n1, d), replace=True)
Y1 = X1.dot(beta) + np.random.normal(size=n1)
method = 'CV+'
# define new objects for storing Y and X
Y_dic = dict.fromkeys(n_vals, 0)
results = pd.DataFrame(columns = ['itrial','d','method','coverage','width', 'n', 'K'])
K_vec = np.array([10, 20, 50])
# loop over n_vals
for n in n_vals:
    Ya = np.zeros((n, ntrial))
    X = np.random.choice(vector, size=(n, d), replace=True)
    for itrial in range(ntrial):
        Y = X.dot(beta) + np.random.normal(size=n)
        Ya[:, itrial] = Y
        # store X, Y for later use with R code
        # store as numpy arrays
        for K in K_vec:
            PIs = compute_PIs(X = X, Y = Y, X1 = X1, K = K)
            coverage = ((PIs[method]['lower'] <= Y1)&(PIs[method]['upper'] >= Y1)).mean()
            width = (PIs[method]['upper'] - PIs[method]['lower']).mean()
            results.loc[len(results)]=[itrial,d,method,coverage,width,n,K]
results.to_csv('python-results.csv')
Code
library(reticulate)
saveRDS(py$results, file = "python-results.rds")
library(ggplot2)
p <- py$results %>%
    ggplot() + geom_boxplot(aes(x = as.factor(n), y = width, fill = as.factor(n)))
plotly::ggplotly(p)
Code
py$results %>%
    tibble::as_tibble() %>%
    dplyr::group_by(n) %>%
    dplyr::summarize(mean_width = mean(width), sd_width = sd(width)) %>%
    knitr::kable()

3 Back to R-based analysis

Code
if (!file.exists("res_list.rds") | !file.exists("outermost_list.rds")){
    res_list <- list()
    outermost_list <- list()
    n_folds_vec <- c(5, 10, 20, 50)
    n_folds <- 10
    for (error_var in error_vars){
    #for (n_folds in n_folds_vec){
        # partition training data into K folds
        training <- pheno_list[[as.character(error_var)]][[1]]
        testing2 <- pheno_list[[as.character(error_var)]][[2]]
        folds <- split(training$id, sample(rep(1:n_folds, length.out = n_tr)))
        training2_pre <- training %>% 
            dplyr::mutate(fold = id %>% purrr::map_int(~which(sapply(folds, function(x) . %in% x)))) %>%
            dplyr::arrange(fold)
        tictoc::tic() # timing
        tl <- list()
        n_per_fold_vec <- c(n_tr / n_folds, n_tr / (n_folds * 10))
        vars_list_list <- list()
        for (n_per_fold in n_per_fold_vec){
            training2 <- training2_pre %>%
                dplyr::group_by(fold) %>%
                dplyr::slice_sample(n = n_per_fold) %>%
                dplyr::ungroup()
            # store each trait's outputs
            out <- list()
            vars_list <- list()
            # loop over traits
            for (trait_num in 1:n_reps){
                tr2_one_trait <- training2 %>%
                    dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
                    dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
                te2_one_trait <- testing2 %>%
                    dplyr::select(id, fold, geno, tidyselect::ends_with(paste0("pheno", trait_num))) %>%
                    dplyr::rename(pheno = tidyselect::ends_with(paste0("pheno", trait_num)))
                
                # regress leaving one fold out
                preds <- list()
                vars <- list()
                for (fold_num in 1:n_folds) {
                    # get training data
                    train <- tr2_one_trait %>% dplyr::filter(fold != fold_num)
                    # get testing data
                    test <- tr2_one_trait %>% dplyr::filter(fold == fold_num)
                    # fit model
                    fit <- lm(pheno ~ 1 + geno, data = train)
                    # predict
                    foo <- test %>% dplyr::bind_rows(te2_one_trait)
                    foo$pred <- predict(fit, newdata = foo)
                    foo$fold_left_out <- fold_num
                    result <- foo %>%
                        dplyr::mutate(beta1_hat = coef(fit)[2],
                                    beta0_hat = coef(fit)[1],
                                    se_beta1_hat = summary(fit)$coefficients[2, 2],
                                    se_beta0_hat = summary(fit)$coefficients[1, 2]
                        )
                    # save predictions
                    preds[[fold_num]] <- result
                    te_geno_mat <- cbind(1, te2_one_trait$geno)
                    tr_geno_mat <- cbind(1, train$geno)
                    vars[[fold_num]] <- diag(te_geno_mat %*% solve(t(tr_geno_mat) %*% tr_geno_mat) %*% t(te_geno_mat))
                }
                vars_list[[trait_num]] <- vars
                # assemble predicted values
                # get absolute residuals
                preds_training <- preds %>%
                    dplyr::bind_rows() %>%
                    dplyr::filter(!is.na(fold)) %>% # keep only training data
                    dplyr::mutate(absolute_residual = abs(pheno - pred)) %>%
                    dplyr::select( - fold_left_out)
                preds_test <- preds %>%
                    dplyr::bind_rows() %>%
                    dplyr::filter(is.na(fold))
                # get indexes
                plus_index <- ceiling((1 - alpha) * (nrow(preds_training) + 1))
                minus_index <- floor(alpha * (nrow(preds_training) + 1))
            
                # go one by one through test set (testing2)
                test_list <- list()
                for (i in 1:nrow(testing2)){
                    tt <- testing2[i, ]
                    pt2 <- preds_test %>% 
                        dplyr::filter(id == tt$id) %>% # our only use of tt
                        dplyr::rename_with(function(x)paste0("test_", x)) 
                        # pt2 contains the five predicted values for a single test subject
                    nrow(pt2) # 5
                    preds_all <- preds_training %>%
                        dplyr::left_join(pt2, by = c("fold" = "test_fold_left_out")) %>%
                        dplyr::mutate(test_fitted_plus_absolute_residual = test_pred + absolute_residual, 
                                    test_fitted_minus_absolute_residual = test_pred - absolute_residual) 
                    uu <- sort(preds_all$test_fitted_plus_absolute_residual)[plus_index]
                    ll <- sort(preds_all$test_fitted_minus_absolute_residual)[minus_index]
                    # make a tibble with exactly one row
                    test_list[[i]] <- preds_all %>%
                        dplyr::select(test_id, test_geno, test_pheno, test_beta1_hat, fold) %>%
                        dplyr::mutate(lower = ll, upper = uu) %>%
                        dplyr::distinct() %>%
                        tidyr::pivot_wider(names_from = fold, 
                                            values_from = test_beta1_hat,
                                            names_prefix = "beta1_hat_fold_"
                                            )
                }
                test_tib <- test_list %>%
                    dplyr::bind_rows() %>%
                    dplyr::mutate(in_interval = test_pheno >= lower & test_pheno <= upper) %>%
                    dplyr::mutate(interval_width = upper - lower) %>%
                    dplyr::mutate(training_set_size = n_per_fold * n_folds,
                                    trait_num = trait_num)
                out[[trait_num]] <- test_tib
            }
            tl[[as.character(n_per_fold * n_folds)]]  <- out
            vars_list_list[[as.character(n_per_fold * n_folds)]] <- vars_list
        }
        res_list[[as.character(error_var)]] <- tl
        outermost_list[[as.character(error_var)]] <- vars_list_list
        tictoc::toc() # timing
    }
    saveRDS(res_list, "res_list.rds")
    saveRDS(outermost_list, "outermost_list.rds")
}
readRDS("res_list.rds") -> res_list
readRDS("outermost_list.rds") -> outermost_list

4 Organize results

Code
modified_list <- res_list %>%
    purrr::map(~ .x %>% 
                    dplyr::bind_rows(.id = "id")) 
modified_list2 <- lapply(names(modified_list), function(name) {
  transform(modified_list[[name]], list_name = name)
}) %>%
    dplyr::bind_rows()

results <- modified_list2 %>%
    dplyr::group_by(list_name, training_set_size) %>%
    dplyr::summarise(mean_interval_width = mean(interval_width),
                    sd_interval_width = sd(interval_width),
                    mean_in_interval = mean(in_interval),
                    sd_in_interval = sd(in_interval), 
                    median_interval_width = median(interval_width)
                    )  
`summarise()` has grouped output by 'list_name'. You can override using the
`.groups` argument.
Code
results %>%
    knitr::kable() %>%
    print()
list_name training_set_size mean_interval_width sd_interval_width mean_in_interval sd_in_interval median_interval_width
0.5 1000 2.329543 0.0516329 0.8950 0.3065841 2.333283
0.5 10000 2.326070 0.0149779 0.8958 0.3055504 2.328521
1 1000 3.311668 0.0923029 0.8956 0.3058093 3.320546
1 10000 3.292276 0.0273718 0.8952 0.3063262 3.292118
2 1000 4.654873 0.1283834 0.8884 0.3149052 4.668523
2 10000 4.655773 0.0444416 0.8926 0.3096521 4.652075
4.5 1000 7.033196 0.2142262 0.8974 0.3034661 7.052161
4.5 10000 6.985217 0.0652323 0.8966 0.3045111 6.979070

5 Figures

5.1 Boxplots for interval width

Code
library(ggplot2)
p1 <- modified_list2 %>%
    ggplot(aes(y = interval_width, colour = as.factor(training_set_size), x = as.factor(list_name)))  +
    geom_boxplot()
p1

Code
ggsave(here::here("figures", "interval_width_boxplot.png"), width = 10, height = 10)
plotly::ggplotly(p1, tooltip = c("x", "y", "colour"))

5.2 Relationship between \(\hat\beta\) and median interval width

Code
p1 <- results %>%
    ggplot(aes(x = mean_b1, y = median_interval_width, colour = as.factor(list_name), replicate_num = trait_num, training_set_size = training_set_size)) +
    geom_point()
plotly::ggplotly(p1, tooltip = c("x", "y", "colour", "replicate_num", "training_set_size"))

6 Session Info

Code
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.3.0 (2023-04-21)
 os       Ubuntu 18.04.6 LTS
 system   x86_64, linux-gnu
 ui       X11
 language en_US:
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/Detroit
 date     2023-05-22
 pandoc   1.19.2.4 @ /usr/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package     * version date (UTC) lib source
 cli           3.6.1   2023-03-23 [1] CRAN (R 4.2.3)
 colorspace    2.1-0   2023-01-23 [1] CRAN (R 4.2.2)
 crosstalk     1.2.0   2021-11-04 [1] CRAN (R 4.1.2)
 data.table    1.14.8  2023-02-17 [1] CRAN (R 4.2.2)
 digest        0.6.31  2022-12-11 [1] CRAN (R 4.2.2)
 dplyr         1.1.2   2023-04-20 [1] CRAN (R 4.3.0)
 ellipsis      0.3.2   2021-04-29 [2] CRAN (R 4.2.1)
 evaluate      0.21    2023-05-05 [1] CRAN (R 4.3.0)
 fansi         1.0.4   2023-01-22 [1] CRAN (R 4.2.2)
 farver        2.1.1   2022-07-06 [1] CRAN (R 4.2.3)
 fastmap       1.1.1   2023-02-24 [1] CRAN (R 4.2.3)
 generics      0.1.3   2022-07-05 [1] CRAN (R 4.2.3)
 ggplot2     * 3.4.2   2023-04-03 [1] CRAN (R 4.2.3)
 glue          1.6.2   2022-02-24 [1] CRAN (R 4.2.0)
 gtable        0.3.3   2023-03-21 [1] CRAN (R 4.2.3)
 here          1.0.1   2020-12-13 [2] CRAN (R 4.1.1)
 htmltools     0.5.5   2023-03-23 [1] CRAN (R 4.2.3)
 htmlwidgets   1.6.2   2023-03-17 [1] CRAN (R 4.2.2)
 httr          1.4.6   2023-05-08 [1] CRAN (R 4.3.0)
 jsonlite      1.8.4   2022-12-06 [1] CRAN (R 4.2.3)
 knitr         1.42    2023-01-25 [1] CRAN (R 4.2.3)
 labeling      0.4.2   2020-10-20 [2] CRAN (R 4.0.3)
 lattice       0.21-8  2023-04-05 [1] CRAN (R 4.2.3)
 lazyeval      0.2.2   2019-03-15 [2] CRAN (R 4.0.3)
 lifecycle     1.0.3   2022-10-07 [1] CRAN (R 4.2.2)
 magrittr    * 2.0.3   2022-03-30 [1] CRAN (R 4.2.0)
 Matrix        1.5-4.1 2023-05-18 [1] CRAN (R 4.3.0)
 munsell       0.5.0   2018-06-12 [2] CRAN (R 4.0.3)
 pillar        1.9.0   2023-03-22 [1] CRAN (R 4.2.3)
 pkgconfig     2.0.3   2019-09-22 [2] CRAN (R 4.0.3)
 plotly        4.10.1  2022-11-07 [1] CRAN (R 4.2.2)
 png           0.1-8   2022-11-29 [1] CRAN (R 4.2.3)
 purrr         1.0.1   2023-01-10 [1] CRAN (R 4.2.2)
 R6            2.5.1   2021-08-19 [2] CRAN (R 4.1.1)
 ragg          1.2.5   2023-01-12 [1] CRAN (R 4.3.0)
 rappdirs      0.3.3   2021-01-31 [2] CRAN (R 4.0.3)
 Rcpp          1.0.10  2023-01-22 [1] CRAN (R 4.2.2)
 reticulate    1.28    2023-01-27 [1] CRAN (R 4.3.0)
 rlang         1.1.1   2023-04-28 [1] CRAN (R 4.3.0)
 rmarkdown     2.21    2023-03-26 [1] CRAN (R 4.2.3)
 rprojroot     2.0.3   2022-04-02 [2] CRAN (R 4.2.0)
 scales        1.2.1   2022-08-20 [1] CRAN (R 4.2.3)
 sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.1.2)
 stringi       1.7.12  2023-01-11 [1] CRAN (R 4.2.2)
 stringr       1.5.0   2022-12-02 [1] CRAN (R 4.2.3)
 systemfonts   1.0.4   2022-02-11 [2] CRAN (R 4.2.0)
 textshaping   0.3.6   2021-10-13 [1] CRAN (R 4.2.2)
 tibble        3.2.1   2023-03-20 [1] CRAN (R 4.2.3)
 tidyr         1.3.0   2023-01-24 [1] CRAN (R 4.2.3)
 tidyselect    1.2.0   2022-10-10 [1] CRAN (R 4.2.2)
 utf8          1.2.3   2023-01-31 [1] CRAN (R 4.2.3)
 vctrs         0.6.2   2023-04-19 [1] CRAN (R 4.3.0)
 viridisLite   0.4.2   2023-05-02 [1] CRAN (R 4.3.0)
 withr         2.5.0   2022-03-03 [1] CRAN (R 4.2.0)
 xfun          0.39    2023-04-20 [1] CRAN (R 4.3.0)
 yaml          2.3.7   2023-01-23 [1] CRAN (R 4.2.3)

 [1] /net/mulan/home/fredboe/R/x86_64-pc-linux-gnu-library/4.0
 [2] /net/mario/cluster/lib/R/site-library-bionic-40
 [3] /usr/local/lib/R/site-library
 [4] /usr/lib/R/site-library
 [5] /usr/lib/R/library

─ Python configuration ───────────────────────────────────────────────────────
 python:         /net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python3
 libpython:      /net/mulan/home/fredboe/miniconda3/envs/barber1/lib/libpython3.10.so
 pythonhome:     /net/mulan/home/fredboe/miniconda3/envs/barber1:/net/mulan/home/fredboe/miniconda3/envs/barber1
 version:        3.10.9 (main, Mar  8 2023, 10:47:38) [GCC 11.2.0]
 numpy:          /net/mulan/home/fredboe/miniconda3/envs/barber1/lib/python3.10/site-packages/numpy
 numpy_version:  1.23.5
 
 python versions found: 
  /net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python3
  /net/mulan/home/fredboe/miniconda3/envs/barber1/bin/python
  /net/mulan/home/fredboe/miniconda3/bin/python
  /net/mulan/home/fredboe/miniconda3/envs/ldsc/bin/python
  /net/mulan/home/fredboe/miniconda3/envs/ldsc1.18/bin/python

──────────────────────────────────────────────────────────────────────────────
Code
# git commit info
gr <- git2r::repository(here::here()) %>%
    git2r::commits()
gr[[1]] 
[f5ff25a] 2023-05-22: feat: added interactivity to the boxplot with ggplotly